###################################################################################
##############################1: Load and Merge Data###############################
###################################################################################

library(dplyr)
library(reshape2)
library(sandwich)
library(lmtest)
library(feather)
library(data.table)
library(ggplot2)
library(stargazer)
library(lfe)
library(broom)
library(plm)
library(ggthemes)
library(data.table)
library(dplyr)



###########################################
####1 -- Data IMPORT AND CLEANING###########
############################################
load("Data/white_minwage.RData")

#Start with prez results
prez_results <- read.csv("Data/Prez County Results (1920-2016).csv", stringsAsFactors = F)
abbrevs <- unique(df.state[,c("state", 'state.abv')])
prez_results <- left_join(prez_results, abbrevs, by=c("State"="state"))

#easiest to just send everything to upper case for merging later since capitalization can be inconsistent
prez_results$Area <- toupper(prez_results$Area)
prez_results$Area <- gsub(" COUNTY", "", prez_results$Area)
prez_results$year <- as.numeric(substr(prez_results$RaceDate, 1,4))


fips <- readRDS("Data/Counties Votes And Fips.RDS")
df.county <- left_join(prez_results, unique(fips[,c('State', 'state.abv', 'Area', 'fips')]))

#There's this one county in sd that changed names. Let's make that consistent
df.county$Area[df.county$Area == 'SHANNON' & df.county$state.abv == "SD"] <- "OGLALA LAKOTA"

df.county <- df.county[df.county$year >= 1980,]
df.county <- df.county[!grepl('special',df.county$Area, ignore.case=T) &
                         !grepl('not reported',df.county$Area, ignore.case=T) &
                         !grepl('absentee',df.county$Area, ignore.case=T), ]


#Fold in minimum wage data
#Want fed and state
min <- read.csv("Data/fed_min.csv", stringsAsFactors=F)
fed_min <- c(as.numeric(min[1,]), 3.35)
fed_years <- c(gsub("X", "", colnames(min)), 1984)
fed_min <- data.frame(fed_min=fed_min, year=fed_years)
fed_min$year <- as.numeric(as.character(fed_min$year))
df.county <- left_join(df.county, fed_min, by=c("year"))
df.county$fed_min[df.county$year == 1996] <- 4.75
df.county$fed_min[df.county$year == 2008] <- 6.55

#now do state min
minKT <- read.csv("Data/state_min_wages.csv")
df.county <- left_join(df.county, minKT, by=c("state.abv"= "st", 'year'))


df.county$fips <- as.character(df.county$fips)

#Next population data
pop <- read.csv("Data/county_population.csv", stringsAsFactors=F,
                colClasses = c(rep("character",10), rep("numeric", 47)))

#need to clean up the fips
pop$fips[nchar(pop$fips) ==4] <- paste0(0, pop$fips[nchar(pop$fips) ==4] )

#This dataset  has multiple
#rows for each county with NA entries in different years for each
#use aggregate to clean it up.
#In some cases there are duplicate values. Taking the average is ok
#Maybe should do something better, but I don't think we're even using this
agg.fun <- function(x){
  if(!is.numeric(x)){
    unique(x)[1]
  }
  else{
    mean(x, na.rm=T)
  }
}

pop <- aggregate(pop, by = list((pop$fips)), agg.fun)
pop <- reshape2::melt(pop, id.vars = "Group.1")
pop$variable <- as.character(pop$variable)
pop <- pop[grepl('pop', pop$variable),]
pop$variable <- gsub('pop', '', pop$variable)
colnames(pop) <- c("fips", 'year', 'pop')
pop$year <- as.numeric(pop$year)
pop <- pop[pop$year< 2010,]

#Was missing 2016 in the other file, so fold in new data for that
pop_new <- read.csv("Data/county_pop_2010_2017.csv", stringsAsFactors=F)
pop_new <- reshape2::melt(pop_new, id.vars = c('Fips', "Id","Name"))
pop_new$variable <- gsub("X","", pop_new$variable)
pop_new <- pop_new[,c("Fips", "variable", "value"),]

colnames(pop_new) <- c("fips", "year", 'pop')
pop <- rbind(pop, pop_new)

pop$pop <- as.numeric(pop$pop)
pop$year <- as.numeric(pop$year)

for(i in 1:length(pop$fips)){df.county$fed_min[df.county$year == 1996] <- 4.75
if(nchar(pop$fips[i])==4){
  pop$fips[i] <- paste0("0", pop$fips[i])
}
}


df.county$fips <- as.character(df.county$fips)
df.county <- left_join(df.county, pop, by=c('year', 'fips'))


df.county$turnout.pop <- as.numeric(df.county$TotalVotes)/df.county$pop
df.county$TotalVotes <- as.numeric(df.county$TotalVotes)


#and vap estimates -- pretty sure these were interpolated in a different R script
vap <- fread('Data/vap.csv',
             colClasses = c("character", 'numeric', 'numeric'))

vap$year <- as.numeric(vap$year)

df.county <- left_join(df.county, vap, by=c('fips'='county', 'year'))
df.county$turnoutvap <- df.county$TotalVotes/df.county$Over19

#make lagged dfs
lag_turnout.df <- df.county[,c("fips", 'year', 'state.abv', 'statemin', 'turnoutvap')]
lag_turnout.df$year <- lag_turnout.df$year + 4
colnames(lag_turnout.df) <- c("fips", 'year', 'state.abv', 'lag.min', 'turnoutvap.lag')

df.county <- df.county[!is.na(df.county$fips),]
df.county <- left_join(df.county, lag_turnout.df)

#function to generate some model output given a dataframe.
df.county$d.vap <- df.county$turnoutvap - df.county$turnoutvap.lag
df.county$d.min <- df.county$statemin - df.county$lag.min
df.county$ratio <- df.county$d.min/df.county$lag.min
df.county <- data.frame(df.county)


###################################################################################
###########################2 -- Make Bordering Counties DF#########################
###################################################################################

#Can make the bordering counties dataframe now
#load bordering counties
df.county$keep <- FALSE
bordering <- read.delim("Data/Bordering_counties.txt", sep='\t', header=F, stringsAsFactors = F)
colnames(bordering) <- c("county", "FIPS", "Neighboring_counties", "Neighboring_Fips")

#Dataset lists each county once before enumerating all its neighbors. Let's fill the
#that in so that each row lists a pair of bordering counties
for(i in 2:nrow(bordering)){
  if(is.na(bordering$FIPS[i])){
    bordering$FIPS[i] <- bordering$FIPS[i-1]
  }
}

#5 digit fips codes
bordering$FIPS[nchar(bordering$FIPS) == 4] <- paste0(0, bordering$FIPS[nchar(bordering$FIPS) == 4])
bordering$Neighboring_Fips[nchar(bordering$Neighboring_Fips) == 4] <-
  paste0(0, bordering$Neighboring_Fips[nchar(bordering$Neighboring_Fips) == 4])

#Some rows will have the same county twice -- obviously this is silly
bordering <- bordering[bordering$FIPS != bordering$Neighboring_Fips,]
state_fips <- substr(bordering$FIPS,1,2)
neighbor_state_fips <- substr(bordering$Neighboring_Fips,1,2)
#subset to pairs that cross state lines
bordering <- bordering[state_fips != neighbor_state_fips,]
#now only keep counties with borders on state lines
df.county$keep <- (df.county$fips %in% bordering$FIPS)
df.borders <- df.county[df.county$keep,]


bordering_2 <- bordering
bordering$ab <- paste0(bordering$FIPS, bordering$Neighboring_Fips)
bordering$ba <- paste0(bordering$Neighboring_Fips, bordering$FIPS)
bordering_2$ab <- bordering$ba
bordering_2$ba <- bordering$ab
bordering <- rbind(bordering, bordering_2)
bordering <- bordering[!duplicated(bordering[,'ba'])&!duplicated(bordering[,'ab']),]

#We've now enumerated every possible pair -- let's add an id for each
bordering$pair <- 1:nrow(bordering)
bordering_new <- rbind(as.matrix(bordering[,c('FIPS', 'pair')]), 
                       as.matrix(bordering[,c('Neighboring_Fips', 'pair')]))

colnames(bordering_new) <- c('fips', 'pair')
bordering_new <- as.data.frame(bordering_new)
df.borders <- left_join(bordering_new, df.borders)
df.borders <- df.borders[!is.na(df.borders$statemin),]


###################################################################################
###########################3 -- Fit County Models #################################
###################################################################################

rat_name <- "Min Wage Proportional Increase"

mean(df.county$real.inc, na.rm=T)
summary(df.county$real.inc)
hist(df.county$real.inc)


stargazer(
  felm(data=df.county, formula = d.vap ~ ratio|0|0|state.abv, exactDOF=T),
  felm(data=df.county, formula = d.vap ~ ratio|factor(fips):year|0|state.abv, exactDOF=T),
  felm(data=df.borders,
       formula = d.vap ~ ratio|pair + factor(fips):year |0|pair + state.abv, exactDOF=T),
  dep.var.labels = "Change in Turnout",
  covariate.labels = c(rat_name,"Intercept"),
  add.lines = list(
    c("County Time Trends", "No", "Yes", "Yes"),
    c("Bordering County Pair Fixed Effects", "No", "No", "Yes")),
  star.cutoffs = c(.05, NA, NA),
  star.char=c("*",NA,NA) ,
  dep.var.labels.include = T,
  notes = c("* p $ \\leq $ .05", "SE's clustered by state in all models", 
            "SE's clustered by bordering county pair in model 3"),
  notes.append=F,
  label='main', title='Main Estimates from Panel Data', out ="Tables/main_with_borders.tex")


mods <- do.call(rbind, lapply(list(  felm(data=df.county, formula = d.vap ~ ratio|0|0|state.abv, exactDOF=T),
                                     felm(data=df.county, formula = d.vap ~ ratio|factor(fips):year|0|state.abv, exactDOF=T),
                                     felm(data=df.borders[!duplicated(df.borders[,c('fips', 'year')]),],
                                          formula = d.vap ~ ratio|factor(pair)  + factor(fips):year + state.abv|0|
                                            state.abv + pair, exactDOF=T)),tidy))

mods <- mods[grepl('ratio', mods$term),]
mods$Model <- factor(c("Base Model", "County Time Trends Model", 'Bordering Counties Model'))
levels(mods$Model) <- rev(c('Base Model',"County Time Trends Model", 'Bordering Counties Model'))
mods$lower <- mods$estimate - 1.96*mods$std.error
mods$upper <- mods$estimate + 1.96*mods$std.error


pdf("Figures/panel_coef.pdf")
ggplot(data=mods, aes(y=estimate, ymin=lower, ymax=upper, x=Model)) + geom_errorbar(position=position_dodge2(width=.1),width=.1) + geom_point(position=position_dodge2(width=.2)) +
  geom_hline(alpha=.5, yintercept=0) +coord_flip() +theme_minimal() + ylab("Increase in Turnout From Doubling the Minimum Wage")  + theme_tufte() + xlab('')
dev.off()

###################################################################################
###########################4 -- State Level Plot ##################################
###################################################################################
minKT <- read.csv("Data/state_min_wages_19802016_complete.csv", stringsAsFactors = F)
df.state  <- minKT[,c('year', 'st', 'statemin')]

min <- read.csv("Data/DOL Min Wage.csv", stringsAsFactors=F)
fed_min <- as.numeric(min[1,])
fed_years <- (gsub("X", "", colnames(min)))
fed_min <- data.frame(fed_min=fed_min, year=fed_years)
df.state <- merge(df.state, fed_min, all.x = T)

df.state$fed_min[df.state$year == 1986] <- 3.35
df.state$fed_min[df.state$year == 1990] <- 3.80
df.state$fed_min[df.state$year == 1984] <- 3.35
df.state$fed_min[df.state$year == 1992] <- 4.25


df.state <- df.state[!(df.state$year %in% seq(1982, 2014, by=4)),]
colnames(df.state) <-  c('year', 'state.abv', 'statemin', 'fedmin')

df.lag <- df.state
df.lag$year <- df.lag$year + 4


colnames(df.lag) <-  c('year', 'state.abv', 'minLag', 'fedLag')

df.state <-  left_join(df.state, df.lag, by=c('year', 'state.abv'))


df.state$d.min <- df.state$statemin - df.state$minLag
df.state$ratio <- df.state$d.min/df.state$minLag
mean(df.state$ratio, na.rm=T)


rat_name <- "Min Wage \\% Increase"
df.state$FedIncrease <- (df.state$fedmin == df.state$statemin) & (df.state$fedmin > df.state$fedLag)


to_plot <- group_by(df.state, year, ratio) %>%
  summarize(n=length(ratio), FedIncrease=mean(FedIncrease)==1)


pdf('Figures/IncreaseByYear.pdf')
ggplot(data=to_plot[to_plot$year != 1980,], aes(x=year, y=ratio, size=n, color=FedIncrease)) + geom_point() +
  xlab("Year") + ylab('Proportional Increase in Min Wage') +labs(size="Number of States", fill="Due to Federal Increase") + theme_minimal()
dev.off()

